LSTM supervised model for anomaly detection¶

Import libraries¶

In [ ]:
import numpy as np
import pandas as pd
import os
from time import time

import warnings
warnings.filterwarnings('ignore')

from slidingWindows import find_length,plotFig
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tqdm import tqdm
c:\ProgramData\anaconda3\envs\TSB\lib\site-packages\numpy\_distributor_init.py:30: UserWarning: loaded more than 1 DLL from .libs:
c:\ProgramData\anaconda3\envs\TSB\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
c:\ProgramData\anaconda3\envs\TSB\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll
  warnings.warn("loaded more than 1 DLL from .libs:"

Data and model utils¶

In [ ]:
def data_preprocessing(filepath):
    """Load a labelled time series from ``filepath`` and split it into train and test parts.

    The file is read as a header-less CSV and rows containing NaN are dropped.
    Column 0 is used as the series values (cast to float) and column 1 as the
    point labels (cast to int). The first 15% of the (possibly trimmed) points
    form the train split and the remainder forms the test split.

    Parameters
    ----------
    filepath : str
        Path of the file to load.

    Returns
    -------
    tuple
        ``(name, slidingWindow, data, label, data_train, label_train,
        data_test, label_test)``. ``name`` is the file name without its
        directory and ``slidingWindow`` is the value returned by
        ``find_length`` for the untrimmed series.
    """
    df = pd.read_csv(filepath, header=None).dropna().to_numpy()

    # os.path.basename honours the platform's own separator; splitting on '/'
    # left the directory prefix in `name` for Windows-style paths.
    name = os.path.basename(filepath)

    data = df[:, 0].astype(float)
    label = df[:, 1].astype(int)

    # Estimated on the full series, i.e. before the trimming below.
    slidingWindow = find_length(data)

    # Files whose first dataset is Dodgers: skip the first 500 points.
    dodgers_first = ("normality1_Dodgers", "normality2_Dodgers", "normality3_Dodgers")
    if any(prefix in name for prefix in dodgers_first):
        data = data[500:]
        label = label[500:]

    # Single split index shared by values and labels (first 15% -> train).
    split = int(0.15 * len(data))
    data_train, data_test = data[:split], data[split:]
    label_train, label_test = label[:split], label[split:]

    print(name)
    print("Estimated Subsequence length: ",slidingWindow)
    print("Time series length: ",len(data))
    print("Number of abnormal points: ",list(label).count(1))

    return name, slidingWindow, data, label, data_train, label_train, data_test, label_test

def plot_sequences(X_train, y_train, X_test, y_test, slidingWindow):
    """Show the train and the test series, one figure each, coloured by label.

    Each series is drawn as a chain of line segments; the colour list is built
    from the labels ('b' where the label equals 0, 'r' otherwise).
    ``slidingWindow`` is accepted but not used for drawing.
    """
    def plot_sequence(X, y, slidingWindow, title):
        plt.figure(figsize=(14, 6))

        # (index, value) pairs, then consecutive pairs joined into segments
        xy = np.column_stack((np.arange(len(X)), X)).reshape(-1, 1, 2)
        segments = np.concatenate([xy[:-1], xy[1:]], axis=1)
        segment_colors = ['r' if point_label != 0 else 'b' for point_label in y]

        axes = plt.gca()
        axes.add_collection(LineCollection(segments, colors=segment_colors, linewidths=2))

        # A bare collection does not autoscale the axes, so set the limits explicitly
        plt.xlim(0, len(X))
        plt.ylim(min(X) - 1, max(X) + 1)
        plt.title(title)
        plt.xlabel('Index')
        plt.ylabel('Value')
        plt.show()

    figures = (
        (X_train, y_train, 'X_train Sequence with Normal (Blue) and Abnormal (Red) Points'),
        (X_test, y_test, 'X_test Sequence with Normal (Blue) and Abnormal (Red) Points and Sliding Window Lines'),
    )
    for series, series_labels, figure_title in figures:
        plot_sequence(series, series_labels, slidingWindow, figure_title)
In [ ]:
def create_lstm_model(learning_rate):
    """Build and compile the binary classifier: LSTM(256) followed by a sigmoid unit.

    Parameters
    ----------
    learning_rate : float
        Learning rate handed to the Adam optimizer.

    Returns
    -------
    A compiled Keras ``Sequential`` model using binary cross-entropy loss.
    """
    net = tf.keras.models.Sequential()
    net.add(tf.keras.layers.LSTM(256))
    net.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    net.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
    )
    return net

def generate_sliding_windows(X_data, y_data, window_size, shift):
    """Cut ``X_data`` into fixed-length windows, each labelled by its last point.

    Window ``k`` starts at index ``k * shift``, covers ``window_size``
    consecutive items and takes ``y_data[start + window_size - 1]`` as label.

    Parameters
    ----------
    X_data : sequence
        Series to window (anything supporting ``len`` and slicing).
    y_data : sequence
        Per-point labels, indexable like ``X_data``.
    window_size : int
        Number of points per window.
    shift : int
        Step between consecutive window starts; must be at least 1.

    Returns
    -------
    (np.ndarray, np.ndarray)
        Stacked windows and their labels. Both are empty when the series is
        shorter than ``window_size``.

    Raises
    ------
    ValueError
        If ``shift`` is smaller than 1.
    """
    if shift < 1:
        raise ValueError(f"shift must be >= 1, got {shift}")

    # '+ 1' keeps the final window: its label index is len(X_data) - 1, which
    # is valid. The previous bound silently dropped that last sample.
    starts = range(0, len(X_data) - window_size + 1, shift)
    X = [X_data[start:start + window_size] for start in starts]
    y = [y_data[start + window_size - 1] for start in starts]
    return np.array(X), np.array(y)

def train_lstm_model(X_train, y_train, X_val=None, y_val=None, window_size=50,shift=1):
    """Train a fresh LSTM classifier on sliding windows of a labelled series.

    The training series (and the validation series, when given) is windowed
    with ``generate_sliding_windows`` and reshaped to ``(samples, window, 1)``.
    Early stopping, learning-rate reduction and checkpointing monitor
    ``val_loss`` when validation data is supplied and ``loss`` otherwise.
    The checkpoint is always written to ``best_model.h5``.

    Returns
    -------
    The fitted Keras model.
    """
    NUM_EPOCHS = 1000
    LEARNING_RATE = 0.0001

    def _as_lstm_samples(series, targets):
        # Window the series, then add the trailing feature axis the LSTM expects
        windows, window_labels = generate_sliding_windows(series, targets, window_size, shift)
        return windows.reshape((windows.shape[0], windows.shape[1], 1)), window_labels

    X_train_samples, y_train_samples = _as_lstm_samples(X_train, y_train)
    model = create_lstm_model(learning_rate=LEARNING_RATE)

    has_validation = X_val is not None
    monitored = 'val_loss' if has_validation else 'loss'
    # None is Keras' default for validation_data, i.e. "no validation"
    validation_data = _as_lstm_samples(X_val, y_val) if has_validation else None

    callbacks = [
        EarlyStopping(monitor=monitored, patience=5, restore_best_weights=True, min_delta=0.000001),
        ReduceLROnPlateau(monitor=monitored, factor=0.5, patience=3, min_lr=1e-7),
        ModelCheckpoint('best_model.h5', save_best_only=True, monitor=monitored, mode='min'),
    ]

    model.fit(X_train_samples, y_train_samples, epochs=NUM_EPOCHS, batch_size=256,
              validation_data=validation_data, callbacks=callbacks, verbose=0)

    return model

def generate_sliding_windows_1(data, window_size, shift):
    """Return every window of ``window_size`` consecutive items of ``data``.

    Windows start at each index from 0 to ``len(data) - window_size``
    inclusive. ``shift`` is accepted but not used: consecutive windows always
    start one item apart. The result is empty when ``data`` is shorter than
    ``window_size``.
    """
    n_windows = len(data) - window_size + 1
    return np.array([data[start:start + window_size] for start in range(n_windows)])

def process_offline(model, X_test, window_size=50, shift=1):
    """Score ``X_test`` one window at a time and min-max rescale the scores.

    ``model.predict`` is called once per window, each reshaped to
    ``(1, window, 1)``. The collected predictions are rescaled to [0, 1] over
    the whole series before being returned as a 1-D array (one score per window).
    """
    windows = generate_sliding_windows_1(X_test, window_size, shift)
    scores = []

    for idx in tqdm(range(len(windows)), desc="Processing batches"):
        window = windows[idx]
        if len(window) == 0:
            break
        # A batch holding a single sample with one feature per time step
        single_sample = window.reshape((1, window.shape[0], 1))
        scores.extend(model.predict(single_sample, verbose=0).ravel())

    # Rescale to (0, 1) across all windows of this series
    raw_scores = np.array(scores).reshape(-1, 1)
    scaler = MinMaxScaler(feature_range=(0, 1))
    return scaler.fit_transform(raw_scores).ravel()

def process_in_batches(model, X_test, batch_size=64, window_size=50, shift=1):
    """Score ``X_test`` window by window, predicting ``batch_size`` windows at a time.

    Parameters
    ----------
    model
        Object exposing ``predict(batch, verbose=0)``.
    X_test : array-like
        Series to score; windowed with ``generate_sliding_windows_1``.
    batch_size : int
        Number of windows passed to each ``predict`` call; must be at least 1.
    window_size : int
        Length of each window.
    shift : int
        Forwarded to ``generate_sliding_windows_1``.

    Returns
    -------
    np.ndarray
        One score per window, min-max rescaled to [0, 1] over the whole series.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1 or no window can be formed.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    X_test_windows = generate_sliding_windows_1(X_test, window_size, shift)
    n_windows = len(X_test_windows)
    if n_windows == 0:
        # Fail with a clear message instead of an opaque scaler error below
        raise ValueError("X_test is shorter than window_size: no window to score")

    scores = []
    # Iterating over batch starts gives exactly ceil(n_windows / batch_size)
    # steps, so the progress bar total is right even when n_windows is an
    # exact multiple of batch_size (no trailing empty batch).
    for start in tqdm(range(0, n_windows, batch_size), desc="Processing batches"):
        X_batch = X_test_windows[start:start + batch_size]

        # Reshape X_batch to fit LSTM input shape
        X_batch = X_batch.reshape((X_batch.shape[0], X_batch.shape[1], 1))

        y_pred = model.predict(X_batch, verbose=0)
        scores.extend(y_pred.ravel())

    # Scale scores to range (0, 1)
    scores = MinMaxScaler(feature_range=(0, 1)).fit_transform(np.array(scores).reshape(-1, 1)).ravel()
    return scores

# Function to simulate streaming environment with pseudo-labeling and a progress bar
def process_in_batches_with_pseudo_labeling(model, X_test, batch_size=64, confidence_threshold=0.9, window_size=50, shift=1, retrain_every=10):
    """Score ``X_test`` in batches and periodically refit ``model`` on its own predictions.

    Windows and predictions are buffered batch by batch. After every
    ``retrain_every``-th batch (batch index 10, 20, ... with the default; never
    after batch 0) the buffered predictions are min-max rescaled, turned into
    0/1 pseudo-labels with ``confidence_threshold`` and used for one epoch of
    ``model.fit``; the buffer is then cleared. Windows buffered after the last
    refit are scored but never trained on.

    NOTE: ``model`` is updated in place, so the caller's model object is
    modified by this function.

    Parameters
    ----------
    model
        Object exposing ``predict`` and ``fit``.
    X_test : array-like
        Series to score; windowed with ``generate_sliding_windows_1``.
    batch_size : int
        Number of windows per ``predict`` call; must be at least 1.
    confidence_threshold : float
        Rescaled scores strictly above this value get pseudo-label 1, others 0.
    window_size : int
        Length of each window.
    shift : int
        Forwarded to ``generate_sliding_windows_1``.
    retrain_every : int
        Refit cadence in batches; must be at least 1.

    Returns
    -------
    np.ndarray
        One score per window, min-max rescaled to [0, 1] over the whole series.

    Raises
    ------
    ValueError
        If ``batch_size`` or ``retrain_every`` is smaller than 1, or no window
        can be formed.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if retrain_every < 1:
        raise ValueError(f"retrain_every must be >= 1, got {retrain_every}")

    # Generate sliding windows from X_test
    X_test_windows = generate_sliding_windows_1(X_test, window_size, shift)
    n_windows = len(X_test_windows)
    if n_windows == 0:
        raise ValueError("X_test is shorter than window_size: no window to score")

    scores = []

    # Buffers are plain lists of per-batch arrays, joined only when a refit
    # happens. The previous `buffer == []` test compared a NumPy array with a
    # list and relied on the truth value of an empty array, which NumPy
    # deprecated and newer releases reject.
    buffer_X_batches = []
    buffer_y_batches = []
    for i, start in enumerate(tqdm(range(0, n_windows, batch_size), desc="Processing batches")):
        X_batch = X_test_windows[start:start + batch_size]

        # Reshape X_batch to fit LSTM input shape
        X_batch = X_batch.reshape((X_batch.shape[0], X_batch.shape[1], 1))

        y_pred = model.predict(X_batch, verbose=0)
        scores.extend(y_pred.ravel())

        buffer_X_batches.append(X_batch)
        buffer_y_batches.append(y_pred)

        if i % retrain_every == 0 and i != 0:
            buffer_X_data = np.concatenate(buffer_X_batches)
            buffer_y_data = np.concatenate(buffer_y_batches)

            # Pseudo-labels: rescale the buffered scores, then threshold them
            y_pred_confident = MinMaxScaler(feature_range=(0,1)).fit_transform(buffer_y_data.reshape(-1, 1)).ravel()
            y_pred_confident = np.where(y_pred_confident > confidence_threshold, 1, 0)
            model.fit(buffer_X_data, y_pred_confident, epochs=1, verbose=0, batch_size=256)

            buffer_X_batches = []
            buffer_y_batches = []

    # Scale scores to range (0,1)
    scores = MinMaxScaler(feature_range=(0,1)).fit_transform(np.array(scores).reshape(-1, 1)).ravel()
    return scores

Load datasets and train LSTM models on the train sets¶

In [ ]:
# Directory containing the .out files
directory = 'generated_data'

# Initialize an empty list to store the file paths
file_paths = []

# os.listdir returns entries in arbitrary order. Later cells slice
# file_paths[:3] and look up models trained for the normality1_* files, so
# iterate in sorted order to make the listing deterministic.
for filename in sorted(os.listdir(directory)):
    if filename.endswith('.out'):
        # Construct the full file path
        full_path = os.path.join(directory, filename)
        # Append the full path to the list
        file_paths.append(full_path)
In [ ]:
# Preview the single-dataset files (normality1_*). They are picked by name
# rather than by position, because the order of `file_paths` comes from
# os.listdir and is not guaranteed.
base_file_paths = sorted(p for p in file_paths if os.path.basename(p).startswith('normality1_'))
for filepath in base_file_paths:
    name, slidingWindow, data, label, X_train, y_train, X_test, y_test = data_preprocessing(filepath)
    print(f"Total points in train set: {X_train.shape[0]}")
    print(f"Number of abnormal points in train set: {sum(y_train==1)}")
    plot_sequences(X_train, y_train, X_test, y_test, slidingWindow)
generated_data\normality1_Dodgers.out
Estimated Subsequence length:  288
Time series length:  49900
Number of abnormal points:  5233
Total points in train set: 7485
Number of abnormal points in train set: 486
No description has been provided for this image
No description has been provided for this image
generated_data\normality1_MGAB.out
Estimated Subsequence length:  49
Time series length:  100000
Number of abnormal points:  200
Total points in train set: 15000
Number of abnormal points in train set: 0
No description has been provided for this image
No description has been provided for this image
generated_data\normality1_NAB.out
Estimated Subsequence length:  289
Time series length:  4031
Number of abnormal points:  400
Total points in train set: 604
Number of abnormal points in train set: 0
No description has been provided for this image
No description has been provided for this image
In [ ]:
# Trained models keyed by dataset file path (plus 'Combined_data' for the general model)
models = {}
In [ ]:
# Sorted so that every normality1_* file is trained before the
# normality2_*/normality3_* files that reuse its model.
for filepath in sorted(file_paths):

    name, slidingWindow, data, label, X_train_val, y_train_val, X_test, y_test = data_preprocessing(filepath)

    # A normality2_*/normality3_* file reuses the model trained on the
    # normality1_* file of its first dataset.
    base_dataset = next(
        (ds for ds in ('Dodgers', 'MGAB', 'NAB')
         if f"normality2_{ds}" in name or f"normality3_{ds}" in name),
        None,
    )
    if base_dataset is not None:
        # Build the key the same way file paths are built, instead of
        # hard-coding a backslash-separated path that only matches on Windows.
        base_key = os.path.join(os.path.dirname(filepath), f"normality1_{base_dataset}.out")
        model = models[base_key]
    else:
        start_time = time()

        # First 80% of the train split for fitting, last 20% for validation
        split = int(0.8*len(X_train_val))
        X_train = X_train_val[:split]
        y_train = y_train_val[:split]
        X_val = X_train_val[split:]
        y_val = y_train_val[split:]
        model = train_lstm_model(X_train.reshape((-1, 1, 1)), y_train, X_val=X_val.reshape((-1, 1, 1)), y_val=y_val)
        end_time = time()

        print(f"Model for {filepath} trained succesfuly in {(end_time-start_time):.1f} s.")

    models[filepath] = model
generated_data\normality1_Dodgers.out
Estimated Subsequence length:  288
Time series length:  49900
Number of abnormal points:  5233
Model for generated_data\normality1_Dodgers.out trained succesfuly in 119.0 s.
generated_data\normality1_MGAB.out
Estimated Subsequence length:  49
Time series length:  100000
Number of abnormal points:  200
Model for generated_data\normality1_MGAB.out trained succesfuly in 253.8 s.
generated_data\normality1_NAB.out
Estimated Subsequence length:  289
Time series length:  4031
Number of abnormal points:  400
Model for generated_data\normality1_NAB.out trained succesfuly in 19.8 s.
generated_data\normality2_Dodgers_MGAB.out
Estimated Subsequence length:  288
Time series length:  149900
Number of abnormal points:  5433
generated_data\normality2_Dodgers_NAB.out
Estimated Subsequence length:  288
Time series length:  53931
Number of abnormal points:  5633
generated_data\normality2_MGAB_Dodgers.out
Estimated Subsequence length:  49
Time series length:  150400
Number of abnormal points:  5812
generated_data\normality2_MGAB_NAB.out
Estimated Subsequence length:  49
Time series length:  104031
Number of abnormal points:  600
generated_data\normality2_NAB_Dodgers.out
Estimated Subsequence length:  288
Time series length:  54431
Number of abnormal points:  6012
generated_data\normality2_NAB_MGAB.out
Estimated Subsequence length:  49
Time series length:  104031
Number of abnormal points:  600
generated_data\normality3_Dodgers_MGAB_NAB.out
Estimated Subsequence length:  288
Time series length:  153931
Number of abnormal points:  5833
generated_data\normality3_Dodgers_NAB_MGAB.out
Estimated Subsequence length:  288
Time series length:  153931
Number of abnormal points:  5833
generated_data\normality3_MGAB_Dodgers_NAB.out
Estimated Subsequence length:  49
Time series length:  154431
Number of abnormal points:  6212
generated_data\normality3_MGAB_NAB_Dodgers.out
Estimated Subsequence length:  49
Time series length:  154431
Number of abnormal points:  6212
generated_data\normality3_NAB_Dodgers_MGAB.out
Estimated Subsequence length:  288
Time series length:  154431
Number of abnormal points:  6212
generated_data\normality3_NAB_MGAB_Dodgers.out
Estimated Subsequence length:  49
Time series length:  154431
Number of abnormal points:  6212

Offline option¶

In [ ]:
# WINDOW_SIZE-1 zeros are prepended to the scores below so they align with the test series;
# it must match the window_size used by the scoring functions (their default is 50).
WINDOW_SIZE= 50
In [ ]:
for filepath in file_paths:
    name, slidingWindow, data, label, X_train, y_train, X_test, y_test = data_preprocessing(filepath)

    model = models[filepath]
    # Offline scoring: one batch large enough to hold every window of the test
    # split. window_size is passed explicitly so that scoring and the padding
    # below always use the same value.
    score = process_in_batches(model, X_test, batch_size=len(X_test), window_size=WINDOW_SIZE)

    # The first WINDOW_SIZE-1 points end no complete window; pad them with 0
    # so that the score vector has one entry per test point.
    zeros = np.zeros(WINDOW_SIZE-1)
    new_score = np.concatenate((zeros,score))
    plotFig(X_test, y_test, new_score, slidingWindow, fileName=name, modelName="LSTM (process in batches)")
generated_data\normality1_Dodgers.out
Estimated Subsequence length:  288
Time series length:  49900
Number of abnormal points:  5233
Processing batches: 100%|██████████| 1/1 [00:43<00:00, 43.24s/it]
generated_data\normality1_MGAB.out
Estimated Subsequence length:  49
Time series length:  100000
Number of abnormal points:  200
Processing batches: 100%|██████████| 1/1 [01:25<00:00, 85.10s/it]
generated_data\normality1_NAB.out
Estimated Subsequence length:  289
Time series length:  4031
Number of abnormal points:  400
Processing batches: 100%|██████████| 1/1 [00:03<00:00,  3.16s/it]
generated_data\normality2_Dodgers_MGAB.out
Estimated Subsequence length:  288
Time series length:  149900
Number of abnormal points:  5433
Processing batches: 100%|██████████| 1/1 [01:53<00:00, 113.35s/it]
generated_data\normality2_Dodgers_NAB.out
Estimated Subsequence length:  288
Time series length:  53931
Number of abnormal points:  5633
Processing batches: 100%|██████████| 1/1 [00:40<00:00, 40.95s/it]
generated_data\normality2_MGAB_Dodgers.out
Estimated Subsequence length:  49
Time series length:  150400
Number of abnormal points:  5812
Processing batches: 100%|██████████| 1/1 [01:54<00:00, 114.37s/it]
generated_data\normality2_MGAB_NAB.out
Estimated Subsequence length:  49
Time series length:  104031
Number of abnormal points:  600
Processing batches: 100%|██████████| 1/1 [01:19<00:00, 79.56s/it]
generated_data\normality2_NAB_Dodgers.out
Estimated Subsequence length:  288
Time series length:  54431
Number of abnormal points:  6012
Processing batches: 100%|██████████| 1/1 [00:40<00:00, 40.98s/it]
generated_data\normality2_NAB_MGAB.out
Estimated Subsequence length:  49
Time series length:  104031
Number of abnormal points:  600
Processing batches: 100%|██████████| 1/1 [01:16<00:00, 76.34s/it]
generated_data\normality3_Dodgers_MGAB_NAB.out
Estimated Subsequence length:  288
Time series length:  153931
Number of abnormal points:  5833
Processing batches: 100%|██████████| 1/1 [02:07<00:00, 127.34s/it]
generated_data\normality3_Dodgers_NAB_MGAB.out
Estimated Subsequence length:  288
Time series length:  153931
Number of abnormal points:  5833
Processing batches: 100%|██████████| 1/1 [01:54<00:00, 114.70s/it]
generated_data\normality3_MGAB_Dodgers_NAB.out
Estimated Subsequence length:  49
Time series length:  154431
Number of abnormal points:  6212
Processing batches: 100%|██████████| 1/1 [02:03<00:00, 123.17s/it]
generated_data\normality3_MGAB_NAB_Dodgers.out
Estimated Subsequence length:  49
Time series length:  154431
Number of abnormal points:  6212
Processing batches: 100%|██████████| 1/1 [02:01<00:00, 121.84s/it]
generated_data\normality3_NAB_Dodgers_MGAB.out
Estimated Subsequence length:  288
Time series length:  154431
Number of abnormal points:  6212
Processing batches: 100%|██████████| 1/1 [02:02<00:00, 122.79s/it]
generated_data\normality3_NAB_MGAB_Dodgers.out
Estimated Subsequence length:  49
Time series length:  154431
Number of abnormal points:  6212
Processing batches: 100%|██████████| 1/1 [01:51<00:00, 111.96s/it]
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Online options¶

Train a simple LSTM model and detect anomalies in the test set.

In [ ]:
# Number of windows handed to each model.predict call in the online variations
BATCH_SIZE = 1000
# WINDOW_SIZE-1 zeros are prepended to the scores so they align with the test series
WINDOW_SIZE= 50

Variation 1 - Process in batches¶

In [ ]:
for filepath in file_paths:
    name, slidingWindow, data, label, X_train, y_train, X_test, y_test = data_preprocessing(filepath)

    model = models[filepath]
    # Process test data in batches to simulate streaming. window_size is passed
    # explicitly so that scoring and the padding below always use the same value.
    score = process_in_batches(model, X_test, batch_size=BATCH_SIZE, window_size=WINDOW_SIZE)

    # The first WINDOW_SIZE-1 points end no complete window; pad them with 0
    # so that the score vector has one entry per test point.
    zeros = np.zeros(WINDOW_SIZE-1)
    new_score = np.concatenate((zeros,score))
    # NOTE(review): BATCH_SIZE is passed where the offline cell passes
    # slidingWindow - confirm this is the intended window for plotFig.
    plotFig(X_test, y_test, new_score, BATCH_SIZE, fileName=name, modelName="LSTM (process in batches)")
generated_data\normality1_Dodgers.out
Estimated Subsequence length:  288
Time series length:  49900
Number of abnormal points:  5233
Processing batches: 100%|██████████| 43/43 [00:37<00:00,  1.16it/s]
generated_data\normality1_MGAB.out
Estimated Subsequence length:  49
Time series length:  100000
Number of abnormal points:  200
Processing batches: 100%|██████████| 85/85 [01:14<00:00,  1.14it/s]
generated_data\normality1_NAB.out
Estimated Subsequence length:  289
Time series length:  4031
Number of abnormal points:  400
Processing batches: 100%|██████████| 4/4 [00:03<00:00,  1.21it/s]
generated_data\normality2_Dodgers_MGAB.out
Estimated Subsequence length:  288
Time series length:  149900
Number of abnormal points:  5433
Processing batches: 100%|██████████| 128/128 [01:43<00:00,  1.24it/s]
generated_data\normality2_Dodgers_NAB.out
Estimated Subsequence length:  288
Time series length:  53931
Number of abnormal points:  5633
Processing batches: 100%|██████████| 46/46 [00:36<00:00,  1.27it/s]
generated_data\normality2_MGAB_Dodgers.out
Estimated Subsequence length:  49
Time series length:  150400
Number of abnormal points:  5812
Processing batches: 100%|██████████| 128/128 [01:43<00:00,  1.23it/s]
generated_data\normality2_MGAB_NAB.out
Estimated Subsequence length:  49
Time series length:  104031
Number of abnormal points:  600
Processing batches: 100%|██████████| 89/89 [01:12<00:00,  1.24it/s]
generated_data\normality2_NAB_Dodgers.out
Estimated Subsequence length:  288
Time series length:  54431
Number of abnormal points:  6012
Processing batches: 100%|██████████| 47/47 [01:06<00:00,  1.42s/it]
generated_data\normality2_NAB_MGAB.out
Estimated Subsequence length:  49
Time series length:  104031
Number of abnormal points:  600
Processing batches: 100%|██████████| 89/89 [02:12<00:00,  1.49s/it]
generated_data\normality3_Dodgers_MGAB_NAB.out
Estimated Subsequence length:  288
Time series length:  153931
Number of abnormal points:  5833
Processing batches: 100%|██████████| 131/131 [03:11<00:00,  1.46s/it]
generated_data\normality3_Dodgers_NAB_MGAB.out
Estimated Subsequence length:  288
Time series length:  153931
Number of abnormal points:  5833
Processing batches: 100%|██████████| 131/131 [03:12<00:00,  1.47s/it]
generated_data\normality3_MGAB_Dodgers_NAB.out
Estimated Subsequence length:  49
Time series length:  154431
Number of abnormal points:  6212
Processing batches: 100%|██████████| 132/132 [03:08<00:00,  1.43s/it]
generated_data\normality3_MGAB_NAB_Dodgers.out
Estimated Subsequence length:  49
Time series length:  154431
Number of abnormal points:  6212
Processing batches: 100%|██████████| 132/132 [02:58<00:00,  1.35s/it]
generated_data\normality3_NAB_Dodgers_MGAB.out
Estimated Subsequence length:  288
Time series length:  154431
Number of abnormal points:  6212
Processing batches: 100%|██████████| 132/132 [03:20<00:00,  1.52s/it]
generated_data\normality3_NAB_MGAB_Dodgers.out
Estimated Subsequence length:  49
Time series length:  154431
Number of abnormal points:  6212
Processing batches: 100%|██████████| 132/132 [03:01<00:00,  1.37s/it]
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Variation 2 - Process in batches with pseudo labeling¶

In [ ]:
for filepath in file_paths:
    name, slidingWindow, data, label, X_train, y_train, X_test, y_test = data_preprocessing(filepath)

    # NOTE(review): the pseudo-labeling routine calls model.fit on this object,
    # and the normality2_*/normality3_* entries of `models` are the same model
    # objects as their normality1_* base. Updates therefore carry over from
    # one file to the next and across re-runs of this cell - confirm intended.
    model = models[filepath]
    # Process test data in batches to simulate streaming. window_size is passed
    # explicitly so that scoring and the padding below always use the same value.
    score = process_in_batches_with_pseudo_labeling(model, X_test, batch_size=BATCH_SIZE, confidence_threshold=0.8, window_size=WINDOW_SIZE)

    # The first WINDOW_SIZE-1 points end no complete window; pad them with 0
    # so that the score vector has one entry per test point.
    zeros = np.zeros(WINDOW_SIZE-1)
    new_score = np.concatenate((zeros,score))
    # NOTE(review): BATCH_SIZE is passed where the offline cell passes
    # slidingWindow - confirm this is the intended window for plotFig.
    plotFig(X_test, y_test, new_score, BATCH_SIZE, fileName=name, modelName="LSTM (pseudo labeling)")
generated_data\normality1_Dodgers.out
Estimated Subsequence length:  288
Time series length:  49900
Number of abnormal points:  5233
Processing batches: 100%|██████████| 43/43 [01:57<00:00,  2.74s/it]
generated_data\normality1_MGAB.out
Estimated Subsequence length:  49
Time series length:  100000
Number of abnormal points:  200
Processing batches: 100%|██████████| 85/85 [03:51<00:00,  2.73s/it]
generated_data\normality1_NAB.out
Estimated Subsequence length:  289
Time series length:  4031
Number of abnormal points:  400
Processing batches: 100%|██████████| 4/4 [00:04<00:00,  1.12s/it]
generated_data\normality2_Dodgers_MGAB.out
Estimated Subsequence length:  288
Time series length:  149900
Number of abnormal points:  5433
Processing batches: 100%|██████████| 128/128 [05:01<00:00,  2.36s/it]
generated_data\normality2_Dodgers_NAB.out
Estimated Subsequence length:  288
Time series length:  53931
Number of abnormal points:  5633
Processing batches: 100%|██████████| 46/46 [01:36<00:00,  2.10s/it]
generated_data\normality2_MGAB_Dodgers.out
Estimated Subsequence length:  49
Time series length:  150400
Number of abnormal points:  5812
Processing batches: 100%|██████████| 128/128 [05:17<00:00,  2.48s/it]
generated_data\normality2_MGAB_NAB.out
Estimated Subsequence length:  49
Time series length:  104031
Number of abnormal points:  600
Processing batches: 100%|██████████| 89/89 [03:34<00:00,  2.41s/it]
generated_data\normality2_NAB_Dodgers.out
Estimated Subsequence length:  288
Time series length:  54431
Number of abnormal points:  6012
Processing batches: 100%|██████████| 47/47 [01:40<00:00,  2.14s/it]
generated_data\normality2_NAB_MGAB.out
Estimated Subsequence length:  49
Time series length:  104031
Number of abnormal points:  600
Processing batches: 100%|██████████| 89/89 [03:22<00:00,  2.28s/it]
generated_data\normality3_Dodgers_MGAB_NAB.out
Estimated Subsequence length:  288
Time series length:  153931
Number of abnormal points:  5833
Processing batches: 100%|██████████| 131/131 [05:08<00:00,  2.35s/it]
generated_data\normality3_Dodgers_NAB_MGAB.out
Estimated Subsequence length:  288
Time series length:  153931
Number of abnormal points:  5833
Processing batches: 100%|██████████| 131/131 [05:09<00:00,  2.36s/it]
generated_data\normality3_MGAB_Dodgers_NAB.out
Estimated Subsequence length:  49
Time series length:  154431
Number of abnormal points:  6212
Processing batches: 100%|██████████| 132/132 [05:13<00:00,  2.38s/it]
generated_data\normality3_MGAB_NAB_Dodgers.out
Estimated Subsequence length:  49
Time series length:  154431
Number of abnormal points:  6212
Processing batches: 100%|██████████| 132/132 [07:34<00:00,  3.44s/it]
generated_data\normality3_NAB_Dodgers_MGAB.out
Estimated Subsequence length:  288
Time series length:  154431
Number of abnormal points:  6212
Processing batches: 100%|██████████| 132/132 [08:11<00:00,  3.73s/it]
generated_data\normality3_NAB_MGAB_Dodgers.out
Estimated Subsequence length:  49
Time series length:  154431
Number of abnormal points:  6212
Processing batches: 100%|██████████| 132/132 [08:35<00:00,  3.90s/it]
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Variation 3 - Process with general model in batches¶

In [ ]:
combined_X_train_data = []
combined_X_val_data = []
combined_y_train_data = []
combined_y_val_data = []

# The general model is trained on the single-dataset files (normality1_*).
# They are picked by name rather than by position, because the order of
# `file_paths` comes from os.listdir and is not guaranteed.
base_file_paths = sorted(p for p in file_paths if os.path.basename(p).startswith('normality1_'))

for filepath in base_file_paths:

    name, slidingWindow, data, label, X_train_val, y_train_val, X_test, y_test = data_preprocessing(filepath)

    # First 80% of each train split for fitting, last 20% for validation
    split = int(0.8*len(X_train_val))
    X_train = X_train_val[:split]
    y_train = y_train_val[:split]
    X_val = X_train_val[split:]
    y_val = y_train_val[split:]
    combined_X_train_data.append(X_train)
    combined_y_train_data.append(y_train)
    combined_X_val_data.append(X_val)
    combined_y_val_data.append(y_val)

# The per-dataset pieces are joined end to end into single series
X_train_combined = np.concatenate(combined_X_train_data)
y_train_combined = np.concatenate(combined_y_train_data)
X_val_combined = np.concatenate(combined_X_val_data)
y_val_combined = np.concatenate(combined_y_val_data)

start_time = time()
combined_model = train_lstm_model(X_train_combined.reshape((-1, 1, 1)), y_train_combined, X_val=X_val_combined.reshape((-1, 1, 1)), y_val=y_val_combined)
end_time = time()
models['Combined_data'] = combined_model
print(f"General model trained succesfuly in {(end_time-start_time):.1f} s.")
generated_data\normality1_Dodgers.out
Estimated Subsequence length:  288
Time series length:  49900
Number of abnormal points:  5233
generated_data\normality1_MGAB.out
Estimated Subsequence length:  49
Time series length:  100000
Number of abnormal points:  200
generated_data\normality1_NAB.out
Estimated Subsequence length:  289
Time series length:  4031
Number of abnormal points:  400
General model trained succesfuly in 535.0 s.
In [ ]:
for filepath in file_paths:
    name, slidingWindow, data, label, X_train, y_train, X_test, y_test = data_preprocessing(filepath)

    # Every file is scored with the single model trained on the combined data
    model = models['Combined_data']
    # Process test data in batches to simulate streaming. window_size is passed
    # explicitly so that scoring and the padding below always use the same value.
    score = process_in_batches(model, X_test, batch_size=BATCH_SIZE, window_size=WINDOW_SIZE)

    # The first WINDOW_SIZE-1 points end no complete window; pad them with 0
    # so that the score vector has one entry per test point.
    zeros = np.zeros(WINDOW_SIZE-1)
    new_score = np.concatenate((zeros,score))
    # NOTE(review): BATCH_SIZE is passed where the offline cell passes
    # slidingWindow - confirm this is the intended window for plotFig.
    plotFig(X_test, y_test, new_score, BATCH_SIZE, fileName=name, modelName="LSTM (general model)")
generated_data\normality1_Dodgers.out
Estimated Subsequence length:  288
Time series length:  49900
Number of abnormal points:  5233
Processing batches: 100%|██████████| 43/43 [01:00<00:00,  1.40s/it]
generated_data\normality1_MGAB.out
Estimated Subsequence length:  49
Time series length:  100000
Number of abnormal points:  200
Processing batches: 100%|██████████| 85/85 [02:11<00:00,  1.55s/it]
generated_data\normality1_NAB.out
Estimated Subsequence length:  289
Time series length:  4031
Number of abnormal points:  400
Processing batches: 100%|██████████| 4/4 [00:05<00:00,  1.25s/it]
generated_data\normality2_Dodgers_MGAB.out
Estimated Subsequence length:  288
Time series length:  149900
Number of abnormal points:  5433
Processing batches: 100%|██████████| 128/128 [03:33<00:00,  1.66s/it]
generated_data\normality2_Dodgers_NAB.out
Estimated Subsequence length:  288
Time series length:  53931
Number of abnormal points:  5633
Processing batches: 100%|██████████| 46/46 [01:12<00:00,  1.57s/it]
generated_data\normality2_MGAB_Dodgers.out
Estimated Subsequence length:  49
Time series length:  150400
Number of abnormal points:  5812
Processing batches: 100%|██████████| 128/128 [03:11<00:00,  1.50s/it]
generated_data\normality2_MGAB_NAB.out
Estimated Subsequence length:  49
Time series length:  104031
Number of abnormal points:  600
Processing batches: 100%|██████████| 89/89 [02:16<00:00,  1.54s/it]
generated_data\normality2_NAB_Dodgers.out
Estimated Subsequence length:  288
Time series length:  54431
Number of abnormal points:  6012
Processing batches: 100%|██████████| 47/47 [01:17<00:00,  1.64s/it]
generated_data\normality2_NAB_MGAB.out
Estimated Subsequence length:  49
Time series length:  104031
Number of abnormal points:  600
Processing batches: 100%|██████████| 89/89 [01:27<00:00,  1.02it/s]
generated_data\normality3_Dodgers_MGAB_NAB.out
Estimated Subsequence length:  288
Time series length:  153931
Number of abnormal points:  5833
Processing batches: 100%|██████████| 131/131 [02:03<00:00,  1.06it/s]
generated_data\normality3_Dodgers_NAB_MGAB.out
Estimated Subsequence length:  288
Time series length:  153931
Number of abnormal points:  5833
Processing batches: 100%|██████████| 131/131 [01:47<00:00,  1.22it/s]
generated_data\normality3_MGAB_Dodgers_NAB.out
Estimated Subsequence length:  49
Time series length:  154431
Number of abnormal points:  6212
Processing batches: 100%|██████████| 132/132 [02:06<00:00,  1.04it/s]
generated_data\normality3_MGAB_NAB_Dodgers.out
Estimated Subsequence length:  49
Time series length:  154431
Number of abnormal points:  6212
Processing batches: 100%|██████████| 132/132 [01:57<00:00,  1.12it/s]
generated_data\normality3_NAB_Dodgers_MGAB.out
Estimated Subsequence length:  288
Time series length:  154431
Number of abnormal points:  6212
Processing batches: 100%|██████████| 132/132 [02:00<00:00,  1.09it/s]
generated_data\normality3_NAB_MGAB_Dodgers.out
Estimated Subsequence length:  49
Time series length:  154431
Number of abnormal points:  6212
Processing batches: 100%|██████████| 132/132 [02:05<00:00,  1.05it/s]
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image